Import Library¶
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, VisualRepresentation
from bertopic.backend import MultiModalBackend
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP
import hdbscan
import base64
from io import BytesIO
from IPython.display import HTML
from PIL import Image
import matplotlib.pyplot as plt
import math
Demonstration of Multimodal Topic Modeling¶
This visualization demonstrates the application of topic modeling to a dataset that, while not sourced from the dark web, covers themes related to weapons, drugs, and robbery.
The images used in this demonstration do not derive from the dark web.
The dataset used comes from: Roboflow's Drug Detection Project.
By associating these images with their corresponding topic labels derived from dark-web text, this approach allows us to expand the analysis and enrich it with both textual and visual information.
This multimodal model offers a deeper understanding of complex datasets by integrating diverse data types and can be extended to further analyze and interpret data in various contexts.
The combination of text and imagery provides a robust framework for exploring and categorizing content in a more comprehensive manner.
1° Visual Model Baseline trained on 3k images¶
Clustering Approach¶
- Parameter Setting:
- Embedding Model for images: clip-ViT-B-32
- Representation Model: KeyBERTInspired, VisualRepresentation
- Visual Model: vit-gpt2-image-captioning with 300 nr_repr_images
- Count Vectorizer
- c-TF-IDF (class-based TF-IDF)
- UMAP: 150 neighbors, 10 components
- HDBSCAN: 50 min cluster size
Clustering Results¶
- Clusters Retrieved: 4 which include:
- Gun
- Drug
- People with gun / drug
- People
Process Datasets¶
# Roboflow drug-detection dataset splits; each folder holds images plus an
# _annotations.csv file mapping filenames to class labels.
TRAIN_FOLDER = 'Datasets/RawData/train'
TEST_FOLDER = 'Datasets/RawData/test'
VALID_FOLDER = 'Datasets/RawData/valid'
def load_image_paths_and_annotations(image_folder: str, annotation_file: str) -> tuple:
    """
    Load image paths and their class labels from an annotation CSV.

    Rows whose image file is missing on disk are reported and skipped.

    :param image_folder: Folder containing the images.
    :param annotation_file: CSV file with at least 'filename' and 'class' columns.
    :return: Tuple ``(image_paths, labels, annotations)`` — a list of existing
        image file paths, the corresponding 'class' values, and the full
        pandas DataFrame read from the CSV.
    """
    annotations = pd.read_csv(annotation_file)
    image_paths = []
    labels = []
    for _, row in tqdm(annotations.iterrows(), total=annotations.shape[0], desc='Loading image paths'):
        # Defined before the try block so the except handler can always
        # reference it, even when row['filename'] itself raises.
        img_filename = None
        try:
            img_filename = row['filename']
            img_path = os.path.join(image_folder, img_filename)
            if os.path.exists(img_path):
                image_paths.append(img_path)
                labels.append(row['class'])
            else:
                print(f'Image file {img_filename} does not exist at {img_path}')
        except Exception as e:
            print(f'Error processing image {img_filename}: {e}')
            continue
    return image_paths, labels, annotations
def image_base64(im: "str | Image.Image") -> str:
    """
    Convert an image to a base64-encoded JPEG string.

    :param im: Either a path to an image file (converted via ``get_thumbnail``)
        or an already-loaded PIL Image object.
    :return: Base64 encoding of the JPEG-serialized image.
    """
    if isinstance(im, str):
        # NOTE(review): get_thumbnail is not defined in this file — presumably
        # it loads the path into a (resized) PIL Image; confirm it is defined
        # upstream before running this cell.
        im = get_thumbnail(im)
    with BytesIO() as buffer:
        im.save(buffer, 'jpeg')
        return base64.b64encode(buffer.getvalue()).decode()
def image_formatter(im: str) -> str:
    """
    Render an image as an inline HTML <img> tag for Jupyter display.

    :param im: Path to the image.
    :return: HTML image tag with the base64-encoded JPEG embedded.
    """
    encoded = image_base64(im)
    return '<img src="data:image/jpeg;base64,{}">'.format(encoded)
# Load all three splits; each call returns (paths, class labels, raw DataFrame).
test_images, test_labels, test_annotations = load_image_paths_and_annotations(TEST_FOLDER, os.path.join(TEST_FOLDER, '_annotations.csv'))
val_images, val_labels, val_annotations = load_image_paths_and_annotations(VALID_FOLDER, os.path.join(VALID_FOLDER, '_annotations.csv'))
train_images, train_labels, train_annotations = load_image_paths_and_annotations(TRAIN_FOLDER, os.path.join(TRAIN_FOLDER, '_annotations.csv'))
# Fit corpus = train + test; the validation split is held out and used for
# prediction later in this notebook.
images = train_images + test_images
labels = train_labels + test_labels
Prepare Model¶
Pre-Compute embeddings¶
# One-off embedding computation, deliberately disabled (wrapped in a string
# literal). Un-comment to recompute CLIP embeddings for text+image and
# image-only inputs and cache them to disk.
'''
embedding_model = MultiModalBackend('clip-ViT-B-32', batch_size=32)
embedd = embedding_model.embed(documents=labels, images=images, verbose=True)
embedd_only_images = embedding_model.embed_images(images=images, verbose=True)
np.savez_compressed('Embeddings/embedding_images.npz', embedd)
np.savez_compressed('Embeddings/embedding_only_images.npz', embedd_only_images)
'''
# Load the cached embeddings; savez_compressed stored each unnamed array
# under the default key 'arr_0'.
with np.load('Embeddings/embedding_only_images.npz') as data1, np.load('Embeddings/embedding_images.npz') as data2:
    embedd_only_images = data1['arr_0']
    embedd = data2['arr_0']
Build BERTopic Model¶
# Keyword-style topic representation refined with KeyBERT-like relevance.
kw = KeyBERTInspired()
# Token counts over the label documents, with English stop words removed.
vectorizer_model = CountVectorizer(stop_words="english")
# Class-based TF-IDF; reduce_frequent_words down-weights words common to all topics.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
# Dimensionality reduction before clustering (parameters from the header cell:
# 150 neighbors, 10 components); random_state pins the layout for reproducibility.
umap_model = UMAP(n_neighbors=150, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
# Density-based clustering; prediction_data=True is required for later .transform calls.
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Embeddings image models
embedding_model = MultiModalBackend('clip-ViT-B-32', batch_size=32)
# Visual model
visual_model = VisualRepresentation(image_to_text_model="nlpconnect/vit-gpt2-image-captioning", nr_samples=20, nr_repr_images=500, image_height=800)
# Two parallel topic representations: image thumbnails and KeyBERT keywords.
representation_model = {
    "Visual_Aspect": visual_model,
    "KeyBERTInspired": kw
}
topic_model = BERTopic(
    min_topic_size=50,
    top_n_words=5,
    n_gram_range=(1, 3),
    representation_model=representation_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True)
# Fit on label documents + images, reusing the pre-computed image-only embeddings
# so CLIP does not have to re-encode the dataset.
topics, probs = topic_model.fit_transform(documents=labels, images=images, embeddings=embedd_only_images)
Show Results¶
# Extract dataframe, keeping only topic id, size, keywords and representative image.
df = topic_model.get_topic_info().drop(["Representative_Docs", "Name", "Representation"], axis=1)[["Topic", "Count", "KeyBERTInspired", "Visual_Aspect"]]
# Visualize the images: render each Visual_Aspect PIL image as an inline
# base64 <img> tag (escape=False keeps the raw HTML).
HTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False))
| Topic | Count | KeyBERTInspired | Visual_Aspect | |
|---|---|---|---|---|
| 0 | -1 | 160 | [drug, people, , , , , , , , ] | |
| 1 | 0 | 2162 | [gun, , , , , , , , , ] | |
| 2 | 1 | 317 | [drug, , , , , , , , , ] | |
| 3 | 2 | 280 | [people, , , , , , , , , ] | |
| 4 | 3 | 71 | [people, drug, , , , , , , , ] |
Model Graphs¶
# Standard BERTopic diagnostics: top words per topic, intertopic distance map,
# topic similarity heatmap, and hierarchical clustering of topics.
topic_model.visualize_barchart()
topic_model.visualize_topics()
topic_model.visualize_heatmap()
topic_model.visualize_hierarchy()
# Separate 2-D UMAP projection of the text+image embeddings, used purely for
# the document scatter plots below (the 10-D model above is for clustering).
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embedd)
topic_model.visualize_documents(labels, reduced_embeddings=reduced_embeddings, hide_document_hover=True)
topic_model.visualize_document_datamap(labels, embeddings=embedd)
topic_model.visualize_term_rank(log_scale=True)
Save Model¶
# Persist locally as safetensors (c-TF-IDF included so topics can be updated),
# then publish the same model to the Hugging Face Hub.
topic_model.save("Models/topic_visual_model_safetensors", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)
topic_model.push_to_hf_hub(
    repo_id="D0men1c0/ISSR_Visual_Model",
    save_embedding_model=embedding_model,
    save_ctfidf=True
)
Predict¶
# Reload the published model from the Hub, re-attaching the CLIP backend
# (the embedding model itself is not stored in the safetensors checkpoint).
topic_model = BERTopic.load('D0men1c0/ISSR_Visual_Model', embedding_model='clip-ViT-B-32')
topic_model.get_topic_info()
| Topic | Count | Name | Representation | KeyBERTInspired | Visual_Aspect | Representative_Docs | |
|---|---|---|---|---|---|---|---|
| 0 | -1 | 160 | -1_drug_people_gun_ | [drug, people, gun, , ] | [drug, people, , , , , , , , ] | <PIL.JpegImagePlugin.JpegImageFile image mode=... | NaN |
| 1 | 0 | 2162 | 0_gun_people_drug_ | [gun, people, drug, , ] | [gun, , , , , , , , , ] | <PIL.JpegImagePlugin.JpegImageFile image mode=... | NaN |
| 2 | 1 | 317 | 1_drug_gun__ | [drug, gun, , , ] | [drug, , , , , , , , , ] | <PIL.JpegImagePlugin.JpegImageFile image mode=... | NaN |
| 3 | 2 | 280 | 2_people_gun__ | [people, gun, , , ] | [people, , , , , , , , , ] | <PIL.JpegImagePlugin.JpegImageFile image mode=... | NaN |
| 4 | 3 | 71 | 3_people_gun_drug_ | [people, gun, drug, , ] | [people, drug, , , , , , , , ] | <PIL.JpegImagePlugin.JpegImageFile image mode=... | NaN |
# Assign topics to the held-out validation split (probabilities discarded).
topic, _ = topic_model.transform(val_labels, images=val_images)
# One single-row topic summary per prediction, concatenated in validation order
# so row i of all_prediction_info matches val_images[i].
all_topic_info = [topic_model.get_topic_info(t) for t in topic]
all_prediction_info = pd.concat(all_topic_info, ignore_index=True)
# Show up to 100 validation images in a 4-column grid, each titled with its
# predicted topic id and top KeyBERT keyword.
sample_images = 100
n_images = min(sample_images, len(val_images))
n_cols = 4
n_rows = math.ceil(n_images / n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 3))
axes = axes.flatten() # Flatten the axes array for easier indexing
# zip pairs image i with prediction row i (relies on the order preserved above).
for i, (path, (_, row)) in enumerate(zip(val_images[:n_images], all_prediction_info.iterrows())):
    ax = axes[i]
    ax.imshow(Image.open(path))
    ax.axis('off')
    # KeyBERTInspired is a list of keywords; [0] is the highest-ranked one.
    ax.set_title(f"Topic {row['Topic']}: {row['KeyBERTInspired'][0]}")
# Hide unused axes
for j in range(n_images, len(axes)):
    axes[j].axis('off')
plt.tight_layout()
plt.show()